Socal Deprivation Index Exploration

 

The social deprivation index (SDI) is an effort to generate a scoring system of socioeconomic factors using US census data from the American Community Survey. The final SDI is a composite measure of percent living in poverty, percent with less than 12 years of education, percent single-parent households, the percentage living in rented housing units, the percentage living in the overcrowded housing unit, percent of households without a car, and percentage nonemployed adults under 65 years of age. For more details please see the webpage link.

In this webpage we will explore the SDI variable by start location and determine which summary statistic to use to coalesce by neighborhood.

 

Load Data

citibike_df = read_csv("./citibike_clean/citibike_clean.csv")

head(citibike_df) |>
  knitr::kable()
bikeid user_type gender age start_time stop_time start_station_latitude start_station_longitude end_station_latitude end_station_longitude start_station_id start_station_name start_zipcode start_uhf34_neighborhood end_station_id end_station_name end_zipcode end_uhf34_neighborhood start_sdi_score start_percent_overweight start_aq end_sdi_score end_percent_overweight end_aq end_borough start_borough
39213 Subscriber Male 51 2019-09-01 00:00:01 2019-09-01 00:05:29 40.73056 -73.97398 40.73222 -73.98166 3733 Avenue C & E 18 St 10009 Union Square, Lower Manhattan 504 1 Ave & E 16 St 10009 Union Square, Lower Manhattan 88 40.5 8.67 88 40.5 8.67 Manhattan Manhattan
21257 Customer Unknown 50 2019-09-01 00:00:04 2019-09-01 00:19:09 40.68292 -73.99318 40.69308 -73.97179 3329 Degraw St & Smith St 11217 Downtown Heights Slope 270 Adelphi St & Myrtle Ave 11238 Bedford Stuyvesant Crown Heights 74 50.8 7.44 70 62.9 6.61 Brooklyn Brooklyn
15242 Customer Unknown 50 2019-09-01 00:00:07 2019-09-01 00:21:40 40.78473 -73.96962 40.76585 -73.98691 3168 Central Park West & W 85 St 10024 Upper West Side 423 W 54 St & 9 Ave 10019 Chelsea Village 41 43.4 7.38 63 38.1 10.02 Manhattan Manhattan
32094 Subscriber Male 27 2019-09-01 00:00:12 2019-09-01 00:10:26 40.74620 -73.98856 40.76030 -73.99884 486 Broadway & W 29 St 10019 Chelsea Village 478 11 Ave & W 41 St 10018 Chelsea Village 63 38.1 10.02 65 38.1 10.02 Manhattan Manhattan
28271 Customer Unknown 50 2019-09-01 00:00:16 2019-09-01 00:08:19 40.70201 -73.92377 40.70624 -73.93387 3775 Suydam St & Knickerbocker Ave 11237 Williamsburg Bushwick 3771 McKibbin St & Bogart St 11206 Williamsburg Bushwick 98 61.8 7.50 100 61.8 7.50 Brooklyn Brooklyn
39424 Customer Unknown 50 2019-09-01 00:00:17 2019-09-01 00:08:27 40.70201 -73.92377 40.70624 -73.93387 3775 Suydam St & Knickerbocker Ave 11237 Williamsburg Bushwick 3771 McKibbin St & Bogart St 11206 Williamsburg Bushwick 98 61.8 7.50 100 61.8 7.50 Brooklyn Brooklyn

 

Summary Statistics

citibike_df |>
  summarise(
    mean_sdi = round(mean(start_sdi_score, na.rm = TRUE), 1),
    sd_sdi = round(sd(start_sdi_score, na.rm = TRUE), 1),
    median_sdi = round(median(start_sdi_score, na.rm = TRUE), 1),
    q1_sdi = round(quantile(start_sdi_score, 0.25, na.rm = TRUE), 1),
    q3_sdi = round(quantile(start_sdi_score, 0.75, na.rm = TRUE), 1),
    iqr_sdi = round(IQR(start_sdi_score, na.rm = TRUE), 1),
    max_sdi = round(max(start_sdi_score, na.rm = TRUE), 1),
    min_sdi = round(min(start_sdi_score, na.rm = TRUE), 1)
  )|>
  pivot_longer(
    cols = c("mean_sdi", "sd_sdi", "median_sdi", "q1_sdi", "q3_sdi", "iqr_sdi", "max_sdi", "min_sdi"),
    names_to = "Statistic",
    values_to = "Value"
  )|>
  knitr::kable()
Statistic Value
mean_sdi 64.9
sd_sdi 21.6
median_sdi 63.0
q1_sdi 49.0
q3_sdi 88.0
iqr_sdi 39.0
max_sdi 100.0
min_sdi 26.0

Since SDI score is on the zipcode level, we next decide if mean or median is the better way to iterate across neighborhoods.

citibike_df |>
  group_by(start_uhf34_neighborhood) |>
  summarise(
    mean_sdi = mean(start_sdi_score, na.rm = TRUE),
    median_sdi = median(start_sdi_score, na.rm = TRUE),
     max_sdi = round(max(start_sdi_score, na.rm = TRUE), 1),
    min_sdi = round(min(start_sdi_score, na.rm = TRUE), 1)
  )|>
  knitr::kable()
start_uhf34_neighborhood mean_sdi median_sdi max_sdi min_sdi
Bedford Stuyvesant Crown Heights 83.89027 83 97 70
Bensonhurst Bay Ridge 66.00000 66 66 66
Central Harlem Morningside Heights 96.12366 97 98 95
Chelsea Village 58.10703 63 70 37
Coney Island Sheepshead Bay 78.00000 78 78 78
Downtown Heights Slope 71.09194 74 97 47
East Flatbush Flatbush 88.00000 88 88 88
East Harlem 100.00000 100 100 100
East New York 98.57200 99 99 97
Greenpoint 78.31831 88 88 61
Kingsbridge Riverdale 92.00000 92 92 92
Long Island City Astoria 78.60394 76 85 62
Ridgewood Forest Hills 75.00000 75 75 75
Sunset Park 95.00000 95 95 95
Union Square, Lower Manhattan 72.17706 88 97 30
Upper East Side Gramercy 40.15412 42 50 26
Upper West Side 50.47311 43 74 41
Washington Heights Inwood 97.00000 97 97 97
West Queens 83.50252 84 84 81
Williamsburg Bushwick 97.58794 98 100 96

Mean and median SDI scores are similar by start neighborhood. However median SDI scores tend to be lower and better capture the wide variation in score across neighborhood.

 

Median SDI by Start Zipcode

plot = citibike_df |>
  group_by(start_zipcode) |>
  summarize(median_SDI = median(start_sdi_score)) |>
ggplot(aes(x = reorder(start_zipcode, -median_SDI), y = median_SDI,)) +
  geom_point() +  
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
  labs(x = "Start Zipcode", y = "Median SDI")  

print(plot)

It is difficult to discern the variation in start zip codes, but we can see there is a wide range.

 

Median SDI by Start Neighborhood

plot = citibike_df |>
  group_by(start_uhf34_neighborhood) |>
  summarize(median_SDI = median(start_sdi_score)) |>
ggplot(aes(x = reorder(start_uhf34_neighborhood, -median_SDI), y = median_SDI,)) +
  geom_point() +  
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
  labs(x = "Start Neighborhood", y = "Median SDI")  

print(plot)


Air Quality Dataset Exploration

 

The air quality index (AQ) originated from NYC Open Data, contributed by the Department of Hygiene and Mental Health and include air quality indexes measured across boroughs in NYC. An exploration of AQ is below.

 

Load Data

#import data
air_quality_df = read_csv("./data/air_quality/Air_Quality_20231126.csv") |>
  janitor::clean_names() |>
  mutate(
    start_date = mdy(start_date),
    year = year(start_date)
  ) |>
  filter(year == "2019")

joined_uhf_34_42 =  read_csv("./data/geocoding/joined_uhf_34_42.csv")

# add zip codes
air_geo =
  air_quality_df |>
  mutate(
    uhf34 = case_when(geo_type_name == "UHF34" ~ "1")
  )

#break up by geocoding system, add zipcodes

# uhf34
air_zip_neighborhoods_34 =
  air_geo |>
  filter(uhf34 == "1") |>
  select(-uhf34, - message)

air_zip_neighborhoods_34 =
  left_join(air_zip_neighborhoods_34, y = (joined_uhf_34_42),
            by = join_by("geo_join_id" == "uhf34")) 

air_final =
  air_zip_neighborhoods_34 |>
  select(-uhf34_neighborhood) |>
filter(!duplicated(unique_id))


head(air_final) |>
  knitr::kable()
unique_id indicator_id name measure measure_info geo_type_name geo_join_id geo_place_name time_period start_date data_value year uhf42 uhf42_neighborhood zip
643475 375 Nitrogen dioxide (NO2) Mean ppb UHF34 207 East Flatbush - Flatbush Summer 2019 2019-06-01 12.75 2019 207 East Flatbush Flatbush 11203
667370 375 Nitrogen dioxide (NO2) Mean ppb UHF34 207 East Flatbush - Flatbush Winter 2019-20 2019-12-01 26.13 2019 207 East Flatbush Flatbush 11203
649819 365 Fine particles (PM 2.5) Mean mcg/m3 UHF34 207 East Flatbush - Flatbush Annual Average 2019 2019-01-01 6.31 2019 207 East Flatbush Flatbush 11203
649858 365 Fine particles (PM 2.5) Mean mcg/m3 UHF34 407 Southwest Queens Annual Average 2019 2019-01-01 6.19 2019 407 Southwest Queens 11414
669642 365 Fine particles (PM 2.5) Mean mcg/m3 UHF34 407 Southwest Queens Winter 2019-20 2019-12-01 7.40 2019 407 Southwest Queens 11414
649886 365 Fine particles (PM 2.5) Mean mcg/m3 UHF34 503504 Southern SI Summer 2019 2019-06-01 7.31 2019 503 Willowbrook 10314

 

Summary Statistics Across 2019

Average Air Quality Index Across Time Periods in 2019

annual_aq = air_final |>
  filter(time_period == "Annual Average 2019") 

annual_aq|>
  ggplot(aes(x = time_period, y = data_value)) +
  geom_boxplot(fill = "skyblue", color = "black", alpha = 0.7) +
  labs(
       x = "Time Period",
       y = "Air Quality Index") +
  theme_minimal()

# Calculate statistics
midpoint <- median(annual_aq$data_value)
low_quantile <- quantile(annual_aq$data_value, 0.25)
top_quantile <- quantile(annual_aq$data_value, 0.75)

Summary of data particles: Fine particles (PM 2.5), Nitrogen dioxide (NO2), and Ozone (O3)

air_final |>
  group_by(name) |>
  summarise(
    min_value = min(data_value, na.rm = TRUE),
    q1 = quantile(data_value, 0.25, na.rm = TRUE),
    median_value = median(data_value, na.rm = TRUE),
    q3 = quantile(data_value, 0.75, na.rm = TRUE),
    max_value = max(data_value, na.rm = TRUE)
  )|>
  knitr::kable()
name min_value q1 median_value q3 max_value
Fine particles (PM 2.5) 5.59 6.8825 7.520 8.1700 11.26
Nitrogen dioxide (NO2) 7.33 14.2125 17.855 22.7925 32.94
Ozone (O3) 24.24 27.9175 29.795 31.4000 37.44
air_final|>
  ggplot(aes(x = data_value, fill = name)) +
  geom_histogram(binwidth = 1, position = "dodge", alpha = 0.7) +
  facet_wrap(~ name, scales = "free") +
  labs(
       x = "Data Value",
       y = "Frequency",
       fill = "Pollutant") +
  theme_minimal()

Evaluation of data particles by Neighborhood

 

Fine Particles by Neighborhoods

Comparison of fine particles by neighborhood

air_particles =
  air_final |>
  filter(name == "Fine particles (PM 2.5)")

air_particles |>
  filter(time_period == "Annual Average 2019") |>
  ggplot(aes(x = reorder(geo_place_name, -data_value), y = data_value)) +
  geom_point() +
  labs(
       x = "Neighborhood",
       y = "Mean Fine particles (mcg/m3)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

Summary statistics of fine particles by Neighborhood

air_final |>
  filter(name == "Fine particles (PM 2.5)") |>
  group_by(geo_place_name) |>
  summarise(
    min_value = min(data_value, na.rm = TRUE),
    q1 = quantile(data_value, 0.25, na.rm = TRUE),
    median_value = median(data_value, na.rm = TRUE),
    q3 = quantile(data_value, 0.75, na.rm = TRUE),
    max_value = max(data_value, na.rm = TRUE)
  )|>
  knitr::kable()
geo_place_name min_value q1 median_value q3 max_value
Bayside Little Neck-Fresh Meadows 6.19 6.750 7.31 7.475 7.64
Bedford Stuyvesant - Crown Heights 6.61 7.210 7.81 7.960 8.11
Bensonhurst - Bay Ridge 6.21 6.885 7.56 7.675 7.79
Borough Park 6.32 6.905 7.49 7.700 7.91
Canarsie - Flatlands 6.15 6.500 6.85 7.280 7.71
Central Harlem - Morningside Heights 7.00 7.280 7.56 7.875 8.19
Chelsea-Village 10.02 10.555 11.09 11.175 11.26
Coney Island - Sheepshead Bay 6.04 6.510 6.98 7.310 7.64
Downtown - Heights - Slope 7.44 7.995 8.55 8.725 8.90
East Flatbush - Flatbush 6.31 6.890 7.47 7.670 7.87
East Harlem 7.11 7.470 7.83 8.050 8.27
East New York 6.56 6.980 7.40 7.710 8.02
Flushing - Clearview 6.61 7.185 7.76 7.790 7.82
Fordham - Bronx Pk 6.68 6.690 6.70 7.235 7.77
Greenpoint 8.56 8.575 8.59 9.260 9.93
Jamaica 6.24 6.845 7.45 7.465 7.48
Kingsbridge - Riverdale 6.37 6.540 6.71 7.245 7.78
Long Island City - Astoria 7.99 8.150 8.31 8.795 9.28
Northeast Bronx 6.66 6.670 6.68 7.190 7.70
Northern SI 6.05 6.680 7.31 7.435 7.56
Pelham - Throgs Neck 6.74 7.010 7.28 7.545 7.81
Ridgewood - Forest Hills 6.56 7.130 7.70 7.805 7.91
Rockaways 5.59 5.855 6.12 6.570 7.02
South Bronx 7.28 7.305 7.33 7.890 8.45
Southeast Queens 6.01 6.585 7.16 7.170 7.18
Southern SI 5.87 6.125 6.38 6.845 7.31
Southwest Queens 6.19 6.795 7.40 7.470 7.54
Sunset Park 7.32 7.745 8.17 8.515 8.86
Union Square-Lower Manhattan 8.67 9.290 9.91 10.045 10.18
Upper East Side-Gramercy 8.98 9.345 9.71 9.890 10.07
Upper West Side 7.38 7.625 7.87 8.210 8.55
Washington Heights 7.11 7.230 7.35 7.760 8.17
West Queens 7.35 7.795 8.24 8.425 8.61
Williamsburg - Bushwick 7.50 7.895 8.29 8.625 8.96

 

Nitrogen Dioxide (NO2) by Neighborhoods

Comparison of NO2 by neighborhood

air_nitrogen =
  air_final |>
  filter(name == "Nitrogen dioxide (NO2)")

air_nitrogen |>
  filter(time_period == "Annual Average 2019") |>
  ggplot(aes(x = reorder(geo_place_name, -data_value), y = data_value)) +
  geom_point() +
  labs(x = "Neighborhood",
       y = "Mean Nitrogen dioxide (ppb)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

Summary statistics of NO2 by Neighborhood

air_final |>
  filter(name == "Nitrogen dioxide (NO2)") |>
  group_by(geo_place_name) |>
  summarise(
    min_value = min(data_value, na.rm = TRUE),
    q1 = quantile(data_value, 0.25, na.rm = TRUE),
    median_value = median(data_value, na.rm = TRUE),
    q3 = quantile(data_value, 0.75, na.rm = TRUE),
    max_value = max(data_value, na.rm = TRUE)
  ) |>
  knitr::kable()
geo_place_name min_value q1 median_value q3 max_value
Bayside Little Neck-Fresh Meadows 12.56 13.605 14.65 17.805 20.96
Bedford Stuyvesant - Crown Heights 14.34 16.015 17.69 22.635 27.58
Bensonhurst - Bay Ridge 13.00 14.350 15.70 19.725 23.75
Borough Park 13.58 15.120 16.66 21.150 25.64
Canarsie - Flatlands 9.24 11.190 13.14 18.000 22.86
Central Harlem - Morningside Heights 16.78 18.240 19.70 23.150 26.60
Chelsea-Village 24.12 24.660 25.20 29.070 32.94
Coney Island - Sheepshead Bay 9.97 11.735 13.50 17.800 22.10
Downtown - Heights - Slope 17.99 18.985 19.98 24.310 28.64
East Flatbush - Flatbush 12.75 14.510 16.27 21.200 26.13
East Harlem 16.73 18.065 19.40 23.080 26.76
East New York 11.97 13.890 15.81 20.655 25.50
Flushing - Clearview 13.61 14.935 16.26 19.590 22.92
Fordham - Bronx Pk 12.17 13.670 15.17 18.880 22.59
Greenpoint 19.22 20.310 21.40 25.335 29.27
Jamaica 13.06 14.285 15.51 18.900 22.29
Kingsbridge - Riverdale 10.39 11.945 13.50 17.385 21.27
Long Island City - Astoria 17.72 18.980 20.24 23.930 27.62
Northeast Bronx 12.28 13.475 14.67 18.290 21.91
Northern SI 12.41 13.290 14.17 17.585 21.00
Pelham - Throgs Neck 12.80 14.135 15.47 18.845 22.22
Ridgewood - Forest Hills 12.77 14.505 16.24 20.750 25.26
Rockaways 7.33 8.685 10.04 13.805 17.57
South Bronx 15.15 16.785 18.42 21.845 25.27
Southeast Queens 12.99 13.745 14.50 17.295 20.09
Southern SI 8.76 9.865 10.97 13.755 16.54
Southwest Queens 11.69 13.415 15.14 19.530 23.92
Sunset Park 16.21 17.335 18.46 22.410 26.36
Union Square-Lower Manhattan 21.16 21.775 22.39 26.550 30.71
Upper East Side-Gramercy 22.29 23.170 24.05 27.945 31.84
Upper West Side 18.84 19.820 20.80 24.455 28.11
Washington Heights 14.03 15.650 17.27 20.940 24.61
West Queens 15.58 17.125 18.67 22.695 26.72
Williamsburg - Bushwick 16.66 18.165 19.67 24.185 28.70

 

Ozone (NO2) by Neighborhoods

Comparison of O3 by neighborhood

air_final |>
  filter(name == "Ozone (O3)") |>
  ggplot(aes(x = reorder(geo_place_name, -data_value), y = data_value)) +
  geom_point() +
  labs(
       x = "Neighborhood",
       y = "Mean Ozone (ppb)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

Summary statistics of O3 by Neighborhood

air_final |>
  filter(name == "Ozone (O3)") |>
  group_by(geo_place_name) |>
  summarise(
    min_value = min(data_value, na.rm = TRUE),
    q1 = quantile(data_value, 0.25, na.rm = TRUE),
    median_value = median(data_value, na.rm = TRUE),
    q3 = quantile(data_value, 0.75, na.rm = TRUE),
    max_value = max(data_value, na.rm = TRUE)
  )|>
  knitr::kable()
geo_place_name min_value q1 median_value q3 max_value
Bayside Little Neck-Fresh Meadows 31.73 31.73 31.73 31.73 31.73
Bedford Stuyvesant - Crown Heights 30.46 30.46 30.46 30.46 30.46
Bensonhurst - Bay Ridge 30.83 30.83 30.83 30.83 30.83
Borough Park 30.62 30.62 30.62 30.62 30.62
Canarsie - Flatlands 34.05 34.05 34.05 34.05 34.05
Central Harlem - Morningside Heights 27.07 27.07 27.07 27.07 27.07
Chelsea-Village 24.24 24.24 24.24 24.24 24.24
Coney Island - Sheepshead Bay 33.33 33.33 33.33 33.33 33.33
Downtown - Heights - Slope 27.61 27.61 27.61 27.61 27.61
East Flatbush - Flatbush 31.43 31.43 31.43 31.43 31.43
East Harlem 27.51 27.51 27.51 27.51 27.51
East New York 32.61 32.61 32.61 32.61 32.61
Flushing - Clearview 31.31 31.31 31.31 31.31 31.31
Fordham - Bronx Pk 29.64 29.64 29.64 29.64 29.64
Greenpoint 27.77 27.77 27.77 27.77 27.77
Jamaica 32.83 32.83 32.83 32.83 32.83
Kingsbridge - Riverdale 28.43 28.43 28.43 28.43 28.43
Long Island City - Astoria 28.36 28.36 28.36 28.36 28.36
Northeast Bronx 30.34 30.34 30.34 30.34 30.34
Northern SI 28.46 28.46 28.46 28.46 28.46
Pelham - Throgs Neck 30.87 30.87 30.87 30.87 30.87
Ridgewood - Forest Hills 31.30 31.30 31.30 31.30 31.30
Rockaways 37.44 37.44 37.44 37.44 37.44
South Bronx 29.16 29.16 29.16 29.16 29.16
Southeast Queens 33.41 33.41 33.41 33.41 33.41
Southern SI 29.64 29.64 29.64 29.64 29.64
Southwest Queens 33.31 33.31 33.31 33.31 33.31
Sunset Park 28.83 28.83 28.83 28.83 28.83
Union Square-Lower Manhattan 25.41 25.41 25.41 25.41 25.41
Upper East Side-Gramercy 24.98 24.98 24.98 24.98 24.98
Upper West Side 25.59 25.59 25.59 25.59 25.59
Washington Heights 27.73 27.73 27.73 27.73 27.73
West Queens 29.95 29.95 29.95 29.95 29.95
Williamsburg - Bushwick 29.34 29.34 29.34 29.34 29.34

 

Overweight Exploration

 

The overweight data contains publicly available data from NYC regarding the percent of people who are overweight per area. An exploration of this dataset is below.

Load Data

overweight_data = citibike_df |>
  select(start_station_latitude,
         start_station_longitude,
         start_uhf34_neighborhood,
         start_zipcode,
         end_station_latitude,
         end_station_longitude,
         end_uhf34_neighborhood,
         start_borough, end_borough,
         end_zipcode,
         start_percent_overweight, 
         end_percent_overweight) |>
  mutate(neighborhood = coalesce(end_uhf34_neighborhood, start_uhf34_neighborhood),
         lat = coalesce(end_station_latitude, start_station_latitude),
         long = coalesce(end_station_longitude, start_station_longitude),
         borough = coalesce(start_borough, end_borough),
         zipcode = coalesce(start_zipcode, end_zipcode),
         overweight = coalesce(start_percent_overweight, end_percent_overweight)) |>
  unique()


head(overweight_data) |>
  knitr::kable()
start_station_latitude start_station_longitude start_uhf34_neighborhood start_zipcode end_station_latitude end_station_longitude end_uhf34_neighborhood start_borough end_borough end_zipcode start_percent_overweight end_percent_overweight neighborhood lat long borough zipcode overweight
40.73056 -73.97398 Union Square, Lower Manhattan 10009 40.73222 -73.98166 Union Square, Lower Manhattan Manhattan Manhattan 10009 40.5 40.5 Union Square, Lower Manhattan 40.73222 -73.98166 Manhattan 10009 40.5
40.68292 -73.99318 Downtown Heights Slope 11217 40.69308 -73.97179 Bedford Stuyvesant Crown Heights Brooklyn Brooklyn 11238 50.8 62.9 Bedford Stuyvesant Crown Heights 40.69308 -73.97179 Brooklyn 11217 50.8
40.78473 -73.96962 Upper West Side 10024 40.76585 -73.98691 Chelsea Village Manhattan Manhattan 10019 43.4 38.1 Chelsea Village 40.76585 -73.98691 Manhattan 10024 43.4
40.74620 -73.98856 Chelsea Village 10019 40.76030 -73.99884 Chelsea Village Manhattan Manhattan 10018 38.1 38.1 Chelsea Village 40.76030 -73.99884 Manhattan 10019 38.1
40.70201 -73.92377 Williamsburg Bushwick 11237 40.70624 -73.93387 Williamsburg Bushwick Brooklyn Brooklyn 11206 61.8 61.8 Williamsburg Bushwick 40.70624 -73.93387 Brooklyn 11237 61.8
40.79127 -73.96484 Upper West Side 10025 40.75020 -73.99093 Chelsea Village Manhattan Manhattan 10001 43.4 38.1 Chelsea Village 40.75020 -73.99093 Manhattan 10025 43.4

Summary Statistics

overweight_data |>  
  summarize(
    mean = mean(overweight, na.rm = TRUE),
    min = min(overweight, na.rm = TRUE),
    max = max(overweight, na.rm = TRUE),
    median = median(overweight, na.rm = TRUE),
    std = sd(overweight, na.rm = TRUE)
  ) |>
  knitr::kable()
mean min max median std
46.86309 36.5 71.2 41.1 10.86188

Percent by Zipcode

Percent of adults classified as overweight or obese, by zipcode

percent_obese =
overweight_data |>
ggplot(aes(x = reorder(zipcode, -overweight), y = overweight,)) +
  geom_point() +  
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
  labs( x = "Location", y = "Percent")  

print(percent_obese)

Percent by Neighborhood

Percent of adults classified as overweight or obese, by neighborhood

percent_obese =
overweight_data |>
  group_by(neighborhood) |>
  summarize(overweight = mean(overweight)) |>
ggplot(aes(x = reorder(neighborhood, -overweight), y = overweight,)) +
  geom_point() +  
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8)) +
  labs( x = "Location", y = "Percent")  

print(percent_obese)

# Create a leaflet map

# Select the necessary columns
map_data <- overweight_data %>%
  select(lat, long, overweight)

map <- leaflet(data = map_data) %>%
  addTiles()  # Add map tiles (you can use different tile providers)

# Add color-coded circles based on overweight percentage
map <- map %>%
  addCircleMarkers(
    radius = 5,  # Adjust the circle size as needed
    fillColor = ~colorFactor("Blues", map_data$overweight)(overweight),
    color = "black",
    fillOpacity = 0.7,
    popup = ~paste("Overweight Percentage:", overweight, "%"),
    label = ~paste("Overweight Percentage:", overweight, "%")
  )

# Display the map
map